Analysis of Global Life Expectancy¶

INFSCI 2415: Information Visualization Midterm Project¶

Vishruth Reddy¶

vpr8@pitt.edu¶

  • Data for the project is obtained from Kaggle.
  • Link to the dataset: "https://www.kaggle.com/datasets/lashagoch/life-expectancy-who-updated/data"
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import plotly.express as px
In [2]:
df = pd.read_csv("Life-Expectancy-Data-Updated.csv")
df.head()
Out[2]:
Country Region Year Infant_deaths Under_five_deaths Adult_mortality Alcohol_consumption Hepatitis_B Measles BMI ... Diphtheria Incidents_HIV GDP_per_capita Population_mln Thinness_ten_nineteen_years Thinness_five_nine_years Schooling Economy_status_Developed Economy_status_Developing Life_expectancy
0 Turkiye Middle East 2015 11.1 13.0 105.8240 1.32 97 65 27.8 ... 97 0.08 11006 78.53 4.9 4.8 7.8 0 1 76.5
1 Spain European Union 2015 2.7 3.3 57.9025 10.35 97 94 26.0 ... 97 0.09 25742 46.44 0.6 0.5 9.7 1 0 82.8
2 India Asia 2007 51.5 67.9 201.0765 1.57 60 35 21.2 ... 64 0.13 1076 1183.21 27.1 28.0 5.0 0 1 65.4
3 Guyana South America 2006 32.8 40.5 222.1965 5.68 93 74 25.3 ... 93 0.79 4146 0.75 5.7 5.5 7.9 0 1 67.0
4 Israel Middle East 2012 3.4 4.3 57.9510 2.89 97 89 27.0 ... 94 0.08 33995 7.91 1.2 1.1 12.8 1 0 81.7

5 rows × 21 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864 entries, 0 to 2863
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Country                      2864 non-null   object 
 1   Region                       2864 non-null   object 
 2   Year                         2864 non-null   int64  
 3   Infant_deaths                2864 non-null   float64
 4   Under_five_deaths            2864 non-null   float64
 5   Adult_mortality              2864 non-null   float64
 6   Alcohol_consumption          2864 non-null   float64
 7   Hepatitis_B                  2864 non-null   int64  
 8   Measles                      2864 non-null   int64  
 9   BMI                          2864 non-null   float64
 10  Polio                        2864 non-null   int64  
 11  Diphtheria                   2864 non-null   int64  
 12  Incidents_HIV                2864 non-null   float64
 13  GDP_per_capita               2864 non-null   int64  
 14  Population_mln               2864 non-null   float64
 15  Thinness_ten_nineteen_years  2864 non-null   float64
 16  Thinness_five_nine_years     2864 non-null   float64
 17  Schooling                    2864 non-null   float64
 18  Economy_status_Developed     2864 non-null   int64  
 19  Economy_status_Developing    2864 non-null   int64  
 20  Life_expectancy              2864 non-null   float64
dtypes: float64(11), int64(8), object(2)
memory usage: 470.0+ KB
In [4]:
df.isna().sum()
Out[4]:
Country                        0
Region                         0
Year                           0
Infant_deaths                  0
Under_five_deaths              0
Adult_mortality                0
Alcohol_consumption            0
Hepatitis_B                    0
Measles                        0
BMI                            0
Polio                          0
Diphtheria                     0
Incidents_HIV                  0
GDP_per_capita                 0
Population_mln                 0
Thinness_ten_nineteen_years    0
Thinness_five_nine_years       0
Schooling                      0
Economy_status_Developed       0
Economy_status_Developing      0
Life_expectancy                0
dtype: int64
In [5]:
df.describe()
Out[5]:
Year Infant_deaths Under_five_deaths Adult_mortality Alcohol_consumption Hepatitis_B Measles BMI Polio Diphtheria Incidents_HIV GDP_per_capita Population_mln Thinness_ten_nineteen_years Thinness_five_nine_years Schooling Economy_status_Developed Economy_status_Developing Life_expectancy
count 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000 2864.000000
mean 2007.500000 30.363792 42.938268 192.251775 4.820882 84.292598 77.344972 25.032926 86.499651 86.271648 0.894288 11540.924930 36.675915 4.865852 4.899825 7.632123 0.206704 0.793296 68.856075
std 4.610577 27.538117 44.569974 114.910281 3.981949 15.995511 18.659693 2.193905 15.080365 15.534225 2.381389 16934.788931 136.485867 4.438234 4.525217 3.171556 0.405012 0.405012 9.405608
min 2000.000000 1.800000 2.300000 49.384000 0.000000 12.000000 10.000000 19.800000 8.000000 16.000000 0.010000 148.000000 0.080000 0.100000 0.100000 1.100000 0.000000 0.000000 39.400000
25% 2003.750000 8.100000 9.675000 106.910250 1.200000 78.000000 64.000000 23.200000 81.000000 81.000000 0.080000 1415.750000 2.097500 1.600000 1.600000 5.100000 0.000000 1.000000 62.700000
50% 2007.500000 19.600000 23.100000 163.841500 4.020000 89.000000 83.000000 25.500000 93.000000 93.000000 0.150000 4217.000000 7.850000 3.300000 3.400000 7.800000 0.000000 1.000000 71.400000
75% 2011.250000 47.350000 66.000000 246.791375 7.777500 96.000000 93.000000 26.400000 97.000000 97.000000 0.460000 12557.000000 23.687500 7.200000 7.300000 10.300000 0.000000 1.000000 75.400000
max 2015.000000 138.100000 224.900000 719.360500 17.870000 99.000000 99.000000 32.100000 99.000000 99.000000 21.680000 112418.000000 1379.860000 27.700000 28.600000 14.100000 1.000000 1.000000 83.800000
In [6]:
df.describe(include=object)
Out[6]:
Country Region
count 2864 2864
unique 179 9
top Turkiye Africa
freq 16 816
In [7]:
df.nunique()
Out[7]:
Country                         179
Region                            9
Year                             16
Infant_deaths                   847
Under_five_deaths              1035
Adult_mortality                2850
Alcohol_consumption            1164
Hepatitis_B                      80
Measles                          87
BMI                             120
Polio                            77
Diphtheria                       80
Incidents_HIV                   393
GDP_per_capita                 2564
Population_mln                 1803
Thinness_ten_nineteen_years     200
Thinness_five_nine_years        207
Schooling                       130
Economy_status_Developed          2
Economy_status_Developing         2
Life_expectancy                 396
dtype: int64
In [8]:
fig, ax = plt.subplots(figsize=(8, 6))

sns.boxplot(data=df, x='Region', y='Life_expectancy', palette='bright', ax=ax, showmeans=True,
            meanprops={'marker': 'o', 'markerfacecolor': 'white', 'markeredgecolor': 'black'})

ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

ax.set_xlabel(None)
ax.set_ylabel("Life expectancy")
ax.set_title("Life expectancy by region")

legend_elements = [plt.Line2D([0],[0], color=sns.color_palette('bright')[i], lw=2, label=region)
                   for i, region in enumerate(df['Region'].unique())]
ax.legend(handles=legend_elements, title="Regions", loc="upper right", bbox_to_anchor=(1.5, 1))

plt.show()
In [9]:
sns.catplot(data = df, x='Region',y='Life_expectancy', kind='violin', aspect=2)
plt.xticks(rotation=90)

plt.show()

The above box plots (or box-and-whisker plots) are a graphical representation of my dataset's summary statistics. The above boxplot shows the life expectancy of the people in various continents.

  • Middle East, South America, Central America, and Oceania have outliers.
  • Africa has the widest and the lowest range of life expectancy whereas, North America has the narrowest and the highest range of life expectancy compared to the other regions.
  • The legend on the side displays the color used to represent each region in the boxplots.

About the boxplots:

1. Box: The central part of the plot is a rectangular box. The box represents the interquartile range (IQR), which includes the middle 50% of the data. The bottom and top of the box mark the first quartile (Q1) and third quartile (Q3), respectively. The width of the box shows the spread of the data within this middle 50%.

  • Q1 (First Quartile): The 25th percentile of the data, i.e., the value below which 25% of the data falls.
  • Q3 (Third Quartile): The 75th percentile of the data, i.e., the value below which 75% of the data falls.
  • IQR (Interquartile Range): The range between Q1 and Q3, representing the middle 50% of the data.

2. Median: A vertical line or marker inside the box represents the median (Q2) of the data, which is the middle value when the data is sorted.

  • Q2 (Median): The middle value of the data set.

3. Whiskers: Lines extending from the top and bottom of the box, indicate the range of the data.

  • Whisker Range: The range between the whiskers, which typically covers the entire data range if whis is set to [0, 100].

4. Outliers: Any data points outside the whiskers are considered outliers and are usually plotted as individual points. Outliers are values that are significantly different from the rest of the data and can provide insights into data anomalies.

In [10]:
plt.figure(figsize=(15,10))

sns.heatmap(data=df.corr(), annot=True)

plt.show()
/var/folders/27/m6jr_n_d2tx7504nd_ywk5b40000gn/T/ipykernel_20757/3381459498.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
  sns.heatmap(data=df.corr(), annot=True)
In [11]:
average_timeline = df.groupby('Year')['Life_expectancy'].mean()
average_compare = df[df['Year'].isin([2000, 2015])].groupby(['Year', 'Region','Country'])['Life_expectancy'].median()
In [12]:
sns.lineplot(data = average_timeline)
plt.xlabel('Years')
plt.ylabel('Life expectancy')
plt.title('Average life expectancy around the world, since 2000-2015.')
plt.show()

sns.lineplot(data = df[df['Region'].isin(['Africa', 'Asia', 'North America'])], x = 'Year', y = 'Life_expectancy', hue = 'Region')
plt.xlabel('Years')
plt.ylabel('Life expectancy')
plt.title('Average life expectancy around the world, since 2000-2015.')
plt.show()
In [13]:
sns.kdeplot(data = df[df['Year'].isin([2000, 2015])], x = 'Life_expectancy',
            hue = 'Year', fill = True, palette = sns.color_palette('BrBG', 2))
plt.axvline(x = df[df['Year'] == 2000]['Life_expectancy'].median(), linestyle = '--', color = 'tan')
plt.axvline(x = df[df['Year'] == 2015]['Life_expectancy'].median(), linestyle = '--', color = 'darkcyan')
plt.text(df[df['Year'] == 2000]['Life_expectancy'].median() + 2, y = 0.015,
         s = 'Median life expectancy at year 2000', color = 'tan')
plt.text(df[df['Year'] == 2015]['Life_expectancy'].median() + 2, y = 0.020,
         s = 'Median life expectancy at year 2015', color = 'darkcyan')
plt.xlabel('Life expectancy')
plt.title('Average life expectancy of people around the world comparison between 2000 and 2015.')
plt.show()
In [14]:
sns.pointplot(data = df[df['Year'].isin([2000, 2015])], x = 'Region', y = 'Life_expectancy',
              hue = 'Year', estimator = np.median, palette = sns.color_palette('BrBG', 2))
plt.xlabel('Regions')
plt.ylabel('Life expectancy')
plt.xticks(rotation = 90)
plt.title('Average life expectancy of population in each region comparison between 2000 and 2015')
plt.show()
In [15]:
sns.barplot(data = df[df['Year'].isin([2000, 2015])], x = 'Year', y = 'Infant_deaths', hue = 'Region')
plt.ylabel('Infant death rates')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')
plt.show()
In [16]:
df_copy3 = df.copy()
df_copy3 = df_copy3[['Schooling', 'Economy_status_Developed', 'Economy_status_Developing']]
df_copy3.head()
Out[16]:
Schooling Economy_status_Developed Economy_status_Developing
0 7.8 0 1
1 9.7 1 0
2 5.0 0 1
3 7.9 0 1
4 12.8 1 0
In [17]:
sns.catplot(data = df, x='Region', y='Life_expectancy', hue='Economy_status_Developed', kind='box', aspect=2)
plt.xticks(rotation=90)

plt.show()
In [18]:
df_copy1 = df.copy()
df_copy1 = df_copy1.drop(columns=['Year', 'Adult_mortality', 'Alcohol_consumption', 'Incidents_HIV', 'BMI', 'GDP_per_capita', 'Population_mln', 'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed', 'Economy_status_Developing'])
df_copy1.head()
Out[18]:
Country Region Infant_deaths Under_five_deaths Hepatitis_B Measles Polio Diphtheria Life_expectancy
0 Turkiye Middle East 11.1 13.0 97 65 97 97 76.5
1 Spain European Union 2.7 3.3 97 94 97 97 82.8
2 India Asia 51.5 67.9 60 35 67 64 65.4
3 Guyana South America 32.8 40.5 93 74 92 93 67.0
4 Israel Middle East 3.4 4.3 97 89 94 94 81.7
In [19]:
sns.pairplot(data = df_copy1, diag_kws={'common_norm': False})

plt.show()
In [20]:
df_copy2 = df.copy()
df_copy2 = df_copy2.drop(columns=['Year', 'Infant_deaths', 'Hepatitis_B', 'Under_five_deaths', 'Diphtheria', 'Measles', 'Polio', 'GDP_per_capita', 'Population_mln', 'Thinness_ten_nineteen_years', 'Thinness_five_nine_years', 'Schooling', 'Economy_status_Developed', 'Economy_status_Developing'])
df_copy2.head()
Out[20]:
Country Region Adult_mortality Alcohol_consumption BMI Incidents_HIV Life_expectancy
0 Turkiye Middle East 105.8240 1.32 27.8 0.08 76.5
1 Spain European Union 57.9025 10.35 26.0 0.09 82.8
2 India Asia 201.0765 1.57 21.2 0.13 65.4
3 Guyana South America 222.1965 5.68 25.3 0.79 67.0
4 Israel Middle East 57.9510 2.89 27.0 0.08 81.7
In [21]:
sns.pairplot(data = df_copy2, hue='Life_expectancy', diag_kws={'common_norm': False}, palette='viridis')

plt.show()
In [22]:
sns.jointplot(data = df, x='Life_expectancy', y='Schooling', hue='Economy_status_Developed',
              kind='kde',
              marginal_kws={'common_norm': False})


plt.show()
In [23]:
sns.jointplot(data = df, x='Life_expectancy', y='Infant_deaths', hue='Economy_status_Developed',
              marginal_kws={'common_norm': False})

plt.show()
In [24]:
sns.scatterplot(data = df, y='Polio', x='Infant_deaths', hue='Region')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')

plt.show()
In [25]:
sns.scatterplot(data = df, y='Diphtheria', x='Infant_deaths', hue='Region')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')

plt.show()
In [26]:
sns.scatterplot(data = df, y='BMI', x='Adult_mortality', hue='Region')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')

plt.show()
In [27]:
sns.scatterplot(data = df, y='BMI', x='Adult_mortality', hue='Region')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left')

plt.show()
In [28]:
sns.pointplot(data = df[df['Year'].isin([2015])], x = 'Region', y = 'GDP_per_capita',
              hue = 'Year', estimator = np.median, palette = sns.color_palette('BrBG', 2))
plt.xlabel('Regions')
plt.ylabel('GDP Per Capita')
plt.xticks(rotation = 90)
# plt.title('Average life expectancy of population in each region comparison between 2000 and 2015')
plt.show()